; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false  -verify-machineinstrs < %s | FileCheck %s

declare void @normal_callee();
declare void @streaming_callee() "aarch64_pstate_sm_enabled";
declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";

define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_streaming_callee:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    bl streaming_compatible_callee
; CHECK-NEXT:    bl streaming_compatible_callee
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret

  call void @streaming_compatible_callee();
  call void @streaming_compatible_callee();
  ret void;
}

; Test that a streaming body and streaming interface, no smstart/smstop are emitted,
; because the function already is in streaming mode upon entry.
define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @streaming_callee();
  call void @streaming_callee();
  ret void;
}

define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_multiple_exit:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    cmp x0, #1
; CHECK-NEXT:    b.ne .LBB2_2
; CHECK-NEXT:  // %bb.1: // %if.else
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB2_2: // %if.end
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
; CHECK-NEXT:    ret

entry:
  %tobool = icmp eq i64 %cond, 1
  br i1 %tobool, label %if.else, label %if.end

if.else:
  ret void;

if.end:
  ret void;
}

; Do a fixed-width vector add on a NEON vector.
; This tests that:
; * Incoming vector in v0.d isn't clobbered by the change in streaming mode.
; * Result vector is correctly preserved after smstop.
define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_no_callee:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub sp, sp, #80
; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    adrp x8, .LCPI3_0
; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #80
; CHECK-NEXT:    ret

  %add = add <2 x i64> %a, <i64 41, i64 42>;
  ret <2 x i64> %add;
}

; Test that we use the interface (not the function's body) to determine what
; streaming-mode to enter the callee. In this case the interface is normal, so
; pstate.sm must be 0 on entry and is 0 upon return from the callee.
define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl locally_streaming_caller_streaming_callee
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret

  call void @locally_streaming_caller_streaming_callee();
  ret void;
}

;
; Test that a locally streaming function correctly retains the
; argument/result registers, because smstart/smstop instructions that are
; inserted to implement the arm_locally_streaming attribute thrashes the
; vector register contents.
;

define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub sp, sp, #96
; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    bl streaming_compatible_callee_vec_args_ret
; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #96
; CHECK-NEXT:    ret
  %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
  ret <2 x i64> %res;
}

declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible"

define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub sp, sp, #112
; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    bl streaming_compatible_callee_vec_arg_struct_ret
; CHECK-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #112
; CHECK-NEXT:    ret
  %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
  %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible"
  ret {<2 x i64>, <2 x i64>} %res;
}

declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible"

; Test that we use `addsvl` for allocating any stack space for locals before `smstart`,
; such that the correct amount of stack space is allocated.
define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: locally_streaming_caller_alloca:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    addsvl sp, sp, #-1
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    mov x0, sp
; CHECK-NEXT:    bl use_ptr
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    addsvl sp, sp, #1
; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  %alloca = alloca <vscale x 4 x i32>
  call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible"
  ret void
}

declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible"

define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: call_to_intrinsic_without_chain:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
; CHECK-NEXT:    str d0, [sp, #72] // 8-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr d0, [sp, #72] // 8-byte Folded Reload
; CHECK-NEXT:    bl cos
; CHECK-NEXT:    str d0, [sp, #72] // 8-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldr d0, [sp, #72] // 8-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %0 = call fast double @llvm.cos.f64(double %x)
  ret double %0
}

declare double @llvm.cos.f64(double)


define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: test_arg_survives_loop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sub sp, sp, #80
; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:  .LBB9_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    subs w0, w0, #1
; CHECK-NEXT:    b.ne .LBB9_1
; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
; CHECK-NEXT:    fmov s0, #1.00000000
; CHECK-NEXT:    ldr s1, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT:    fadd s0, s1, s0
; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #80
; CHECK-NEXT:    ret
entry:
  br label %for.body

for.body:
  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %inc = add nuw nsw i32 %i.02, 1
  %exitcond.not = icmp eq i32 %inc, %N
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup:
  %add = fadd float %arg, 1.000000e+00
  ret float %add

}
